[ARM] Replace ABS and tABS machine nodes with custom lowering #156717
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-arm

Author: AZero13 (AZero13)

Changes: Just do a custom lowering instead. Also copy paste the cmov-neg fold to prevent regressions in nabs.

Full diff: https://github.com/llvm/llvm-project/pull/156717.diff

9 Files Affected:
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9ad46df159c20..847b7af5a9b11 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -306,9 +306,6 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
bool tryInsertVectorElt(SDNode *N);
- // Select special operations if node forms integer ABS pattern
- bool tryABSOp(SDNode *N);
-
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
@@ -3459,45 +3456,6 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
return false;
}
-/// Target-specific DAG combining for ISD::SUB.
-/// Target-independent combining lowers SELECT_CC nodes of the form
-/// select_cc setg[ge] X, 0, X, -X
-/// select_cc setgt X, -1, X, -X
-/// select_cc setl[te] X, 0, -X, X
-/// select_cc setlt X, 1, -X, X
-/// which represent Integer ABS into:
-/// Y = sra (X, size(X)-1); sub (xor (X, Y), Y)
-/// ARM instruction selection detects the latter and matches it to
-/// ARM::ABS or ARM::t2ABS machine node.
-bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
- SDValue SUBSrc0 = N->getOperand(0);
- SDValue SUBSrc1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
-
- if (Subtarget->isThumb1Only())
- return false;
-
- if (SUBSrc0.getOpcode() != ISD::XOR || SUBSrc1.getOpcode() != ISD::SRA)
- return false;
-
- SDValue XORSrc0 = SUBSrc0.getOperand(0);
- SDValue XORSrc1 = SUBSrc0.getOperand(1);
- SDValue SRASrc0 = SUBSrc1.getOperand(0);
- SDValue SRASrc1 = SUBSrc1.getOperand(1);
- ConstantSDNode *SRAConstant = dyn_cast<ConstantSDNode>(SRASrc1);
- EVT XType = SRASrc0.getValueType();
- unsigned Size = XType.getSizeInBits() - 1;
-
- if (XORSrc1 == SUBSrc1 && XORSrc0 == SRASrc0 && XType.isInteger() &&
- SRAConstant != nullptr && Size == SRAConstant->getZExtValue()) {
- unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
- CurDAG->SelectNodeTo(N, Opcode, VT, XORSrc0);
- return true;
- }
-
- return false;
-}
-
/// We've got special pseudo-instructions for these
void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
@@ -3685,12 +3643,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
if (tryInlineAsm(N))
return;
break;
- case ISD::SUB:
- // Select special operations if SUB node forms integer ABS pattern
- if (tryABSOp(N))
- return;
- // Other cases are autogenerated.
- break;
case ISD::Constant: {
unsigned Val = N->getAsZExtVal();
// If we can't materialize the constant we need to use a literal pool
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b5c01eafcf108..fbc618473cac7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -647,6 +647,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
if (!Subtarget->hasV8_1MMainlineOps())
setOperationAction(ISD::UCMP, MVT::i32, Custom);
+ if (!Subtarget->isThumb1Only())
+ setOperationAction(ISD::ABS, MVT::i32, Custom);
+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -10700,9 +10703,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UCMP:
case ISD::SCMP:
return LowerCMP(Op, DAG);
+ case ISD::ABS:
+ return LowerABS(Op, DAG);
}
}
+// Generate CMP + CMOV for integer abs.
+SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+ // Generate CMP & CMOV.
+ SDLoc DL(Op);
+
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(0, DL, MVT::i32), Op.getOperand(0));
+
+ SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
+ DAG.getConstant(0, DL, MVT::i32));
+ return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
+ DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
+}
+
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
unsigned IntNo = N->getConstantOperandVal(0);
@@ -12285,89 +12304,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case ARM::Int_eh_sjlj_setup_dispatch:
EmitSjLjDispatchBlock(MI, BB);
return BB;
-
- case ARM::ABS:
- case ARM::t2ABS: {
- // To insert an ABS instruction, we have to insert the
- // diamond control-flow pattern. The incoming instruction knows the
- // source vreg to test against 0, the destination vreg to set,
- // the condition code register to branch on, the
- // true/false values to select between, and a branch opcode to use.
- // It transforms
- // V1 = ABS V0
- // into
- // V2 = MOVS V0
- // BCC (branch to SinkBB if V0 >= 0)
- // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
- // SinkBB: V1 = PHI(V2, V3)
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator BBI = ++BB->getIterator();
- MachineFunction *Fn = BB->getParent();
- MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
- Fn->insert(BBI, RSBBB);
- Fn->insert(BBI, SinkBB);
-
- // Set the call frame size on entry to the new basic blocks.
- unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
- RSBBB->setCallFrameSize(CallFrameSize);
- SinkBB->setCallFrameSize(CallFrameSize);
-
- Register ABSSrcReg = MI.getOperand(1).getReg();
- Register ABSDstReg = MI.getOperand(0).getReg();
- bool ABSSrcKIll = MI.getOperand(1).isKill();
- bool isThumb2 = Subtarget->isThumb2();
- MachineRegisterInfo &MRI = Fn->getRegInfo();
- // In Thumb mode S must not be specified if source register is the SP or
- // PC and if destination register is the SP, so restrict register class
- Register NewRsbDstReg = MRI.createVirtualRegister(
- isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
-
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- SinkBB->splice(SinkBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- SinkBB->transferSuccessorsAndUpdatePHIs(BB);
-
- BB->addSuccessor(RSBBB);
- BB->addSuccessor(SinkBB);
-
- // fall through to SinkMBB
- RSBBB->addSuccessor(SinkBB);
-
- // insert a cmp at the end of BB
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(ABSSrcReg)
- .addImm(0)
- .add(predOps(ARMCC::AL));
-
- // insert a bcc with opposite CC to ARMCC::MI at the end of BB
- BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
- .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
-
- // insert rsbri in RSBBB
- // Note: BCC and rsbri will be converted into predicated rsbmi
- // by if-conversion pass
- BuildMI(*RSBBB, RSBBB->begin(), dl,
- TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
- .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
- .addImm(0)
- .add(predOps(ARMCC::AL))
- .add(condCodeOp());
-
- // insert PHI in SinkBB,
- // reuse ABSDstReg to not change uses of ABS instruction
- BuildMI(*SinkBB, SinkBB->begin(), dl,
- TII->get(ARM::PHI), ABSDstReg)
- .addReg(NewRsbDstReg).addMBB(RSBBB)
- .addReg(ABSSrcReg).addMBB(BB);
-
- // remove ABS instruction
- MI.eraseFromParent();
-
- // return last added BB
- return SinkBB;
- }
case ARM::COPY_STRUCT_BYVAL_I32:
++NumLoopByVals;
return EmitStructByval(MI, BB);
@@ -14079,6 +14015,48 @@ static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
CSINC.getOperand(3));
}
+static bool isNegatedInteger(SDValue Op) {
+ return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
+}
+
+static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
+}
+
+// Try to fold
+//
+// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
+//
+// The folding helps cmov to be matched with csneg without generating
+// redundant neg instruction.
+static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
+ if (!isNegatedInteger(SDValue(N, 0)))
+ return SDValue();
+
+ SDValue CSel = N->getOperand(1);
+ if (CSel.getOpcode() != ARMISD::CMOV || !CSel->hasOneUse())
+ return SDValue();
+
+ SDValue N0 = CSel.getOperand(0);
+ SDValue N1 = CSel.getOperand(1);
+
+ // If both of them is not negations, it's not worth the folding as it
+ // introduces two additional negations while reducing one negation.
+ if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
+ return SDValue();
+
+ SDValue N0N = getNegatedInteger(N0, DAG);
+ SDValue N1N = getNegatedInteger(N1, DAG);
+
+ SDLoc DL(N);
+ EVT VT = CSel.getValueType();
+ return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CSel.getOperand(2),
+ CSel.getOperand(3));
+}
+
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
@@ -14095,6 +14073,9 @@ static SDValue PerformSUBCombine(SDNode *N,
if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
return R;
+ if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
+ return Val;
+
if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 196ecb1b9f678..4d5c2e4a51170 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -906,6 +906,7 @@ class VectorType;
SelectionDAG &DAG) const;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index bdb16d7d39266..282ff534fc112 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -5185,12 +5185,6 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>,
let hasSideEffects = 1;
}
-let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
- // Pseudo instruction that combines movs + predicated rsbmi
- // to implement integer ABS
- def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
-}
-
let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index c00d616670b5a..c229c8e4491df 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -4393,13 +4393,6 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
imm:$cp))]>,
Requires<[IsThumb2]>;
-// Pseudo instruction that combines movs + predicated rsbmi
-// to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
-def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
- NoItinerary, []>, Requires<[IsThumb2]>;
-}
-
//===----------------------------------------------------------------------===//
// Coprocessor load/store -- for disassembly only
//
diff --git a/llvm/lib/Target/ARM/ARMScheduleA57.td b/llvm/lib/Target/ARM/ARMScheduleA57.td
index 3baac6b233c45..e20bbadbc8eee 100644
--- a/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -137,7 +137,7 @@ def : InstRW<[WriteNoop, WriteNoop], (instregex
def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
// Pseudos
-def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
+def : InstRW<[WriteNoop], (instregex
"(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
"tLDRpci_pic", "(t2)?SUBS_PC_LR",
"JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
diff --git a/llvm/lib/Target/ARM/ARMScheduleR52.td b/llvm/lib/Target/ARM/ARMScheduleR52.td
index e85646915117c..c350180baa250 100644
--- a/llvm/lib/Target/ARM/ARMScheduleR52.td
+++ b/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -235,7 +235,7 @@ def : InstRW< [R52WriteALU_WRI, R52Read_EX1, R52Read_EX1],
"UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
"t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
"t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
- "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX","t2ABS")>;
+ "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
// Parallel arithmetic
def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
diff --git a/llvm/lib/Target/ARM/ARMScheduleSwift.td b/llvm/lib/Target/ARM/ARMScheduleSwift.td
index 88682f5c0d2c4..8d53119f4747b 100644
--- a/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -1081,9 +1081,6 @@ let SchedModel = SwiftModel in {
def : WriteRes<WriteVST2, []>;
def : WriteRes<WriteVST3, []>;
def : WriteRes<WriteVST4, []>;
-
- // Not specified.
- def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
// Preload.
def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
let ReleaseAtCycles = [0];
diff --git a/llvm/test/CodeGen/ARM/iabs.ll b/llvm/test/CodeGen/ARM/iabs.ll
index 758fe7507c0b2..56d016967c8c0 100644
--- a/llvm/test/CodeGen/ARM/iabs.ll
+++ b/llvm/test/CodeGen/ARM/iabs.ll
@@ -18,6 +18,17 @@ define i32 @test(i32 %a) {
ret i32 %abs
}
+define dso_local i32 @test_nabs(i32 %a) {
+; CHECK-LABEL: test_nabs:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: rsbpl r0, r0, #0
+; CHECK-NEXT: bx lr
+ %nabs = tail call i32 @llvm.abs.i32(i32 %a, i1 false)
+ %cond = sub i32 0, %nabs
+ ret i32 %cond
+}
+
; rdar://11633193
;; 3 instructions will be generated for abs(a-b):
;; subs
This sounds like a nice cleanup. I've wondered why the default lowering used these ABS pseudos. It is a shame it needs a custom lower, which can have downsides, but in this case, with the custom lowering being so odd and the midend already canonicalizing to abs intrinsics, it probably makes sense.
Can you update the costs to match the expansion?
Yeah, I thought that is what happened automatically.
Already done, did you see?
I meant to update the costs it produces. Some of them are debatable, but the old numbers look a little better to me. The new ones are just "2 * the legalization factor" as the node is custom.
Is 2 not accurate, since it is just cmp+cmov?
Yeah, it depends on how much a cmp costs. The BasicTTI seems to use ICmp+Select+Sub costs, even if the default expansion is via asr+xor+sub. The cost of i64 seems too low, and you could argue the code-size costs of the others should be higher than 2.
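For readers following along, a rough sketch of the kind of generic accounting described above: the cost of a scalar @llvm.abs is approximated as an ICmp plus a Select plus a Sub. This is not the actual BasicTTI code; the helper name and the exact set of TTI calls are assumptions.

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Approximate the target-independent cost of a scalar @llvm.abs as the sum of
// a compare, a select, and a subtract, mirroring the accounting described
// above. Hypothetical helper for illustration, not LLVM's real implementation.
static InstructionCost
approxGenericAbsCost(const TargetTransformInfo &TTInfo, Type *Ty,
                     TargetTransformInfo::TargetCostKind CostKind) {
  Type *CondTy = Type::getInt1Ty(Ty->getContext());
  InstructionCost Cost = 0;
  // icmp sgt X, -1 (or similar) to pick the sign.
  Cost += TTInfo.getCmpSelInstrCost(Instruction::ICmp, Ty, CondTy,
                                    CmpInst::ICMP_SGT, CostKind);
  // select between X and -X.
  Cost += TTInfo.getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
  // sub 0, X to form the negation.
  Cost += TTInfo.getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
  return Cost;
}
```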
Don't all ALU instructions on ARM, with a few exceptions, cost the same?
Well, ICmp+Select+Sub is the custom lowering we are going with. We are not doing that in Thumb1.
My understanding is that it depends on the CPU. Most will perform the
We don't have to worry about T1 since the behavior for that is unchanged.
The expansion costs seem accurate. Not only does it match how it is done for AArch64, but a drop from 5 to 4, when it is in fact 4 in code size, latency, and throughput, makes sense. Even if what you were saying is right, the original cost of 5 isn't accurate anyway.
Anyway, the costs are fine and more accurate now. If we want to worry about cycles, that should be in another PR, because I do not know how to do this without having it apply to all intrinsics that involve shifted operands. @davemgreen
Yeah, everything I said was for the most part referring to Thumb2.
It would be ARMTTIImpl::getIntrinsicInstrCost that needs to be adjusted, to add a better cost for scalar instructions. It seems a bit off to me still, in terms of the number of instructions and what I consider to be high-cost, but it's probably not far off. Can you rebase the patch?
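As a rough illustration of that suggestion (not part of this PR), a special case for the scalar i32 abs intrinsic could be added to ARMTTIImpl::getIntrinsicInstrCost. The function does exist in ARMTargetTransformInfo.cpp, but the placement, the constant, and the Thumb1 guard below are assumptions, not reviewed code.

```cpp
// Hypothetical sketch: report a cost of ~2 for a scalar i32 @llvm.abs, since
// the new custom lowering emits a cmp plus a predicated rsb. Everything here
// except the function's existence is an assumption.
InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::abs && !ST->isThumb1Only() &&
      ICA.getReturnType()->isIntegerTy(32))
    return 2; // cmp + rsbmi
  // ... existing cases unchanged ...
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
```

Whether 2 is the right number for code size versus throughput is exactly the per-CPU judgment discussed above.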
@davemgreen done!
Thanks. Let's go with this and we can alter it if needed. LGTM.
Thank you. Can you please merge? @davemgreen
I do not have merge permissions. I am asking a reviewer to merge it on my behalf.
Some PowerPC tests failed when I tried it, but I'm pretty sure those are unrelated to this patch. Please let us know if anything does go wrong. Thanks |